P-E (phenotype-exposure) associations with flexible adjustments
# Housekeeping: if a DB connection survives from a previous run, close it and
# start from a clean workspace before re-running the analysis.
# NOTE(review): dbDisconnect() is called before library(DBI) is attached below —
# this only works if DBI was loaded in the prior session; confirm intended.
# NOTE(review): remove(list=ls()) wipes the global environment (a known script
# anti-pattern); presumably deliberate here to guarantee a clean rerun.
if(exists("con")) {
dbDisconnect(con)
remove(list=ls())
}
# Load analysis packages: tidyverse for data wrangling/plotting; DBI + RSQLite
# for the pre-computed summary-statistics database; gt/DT/reactable for tables;
# ggsci/ggrepel/cowplot for figures; broom for model-summary tidying.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(DBI)
library(ggsci)
library(DT)
library(ggrepel)
library(cowplot)
library(reactable)
library(gt)
library(broom)
# Open the SQLite database of P-E (phenotype-exposure) summary statistics.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname='../db/pe_summary_stats_02_2024-v2.sqlite')
# Lazy references to the database tables (materialized with collect() later).
varnames <- tbl(con, "variable_names_epcf")
adjusted_meta <- tbl(con, "adjusted_meta")
#mvr2 <- tbl(con, 'mvr2') |> mutate(mv = mve_rsq-base_rsq)
# Multivariable R2 results; mv = gain of the multivariable exposure model over base.
mvr2 <- read_rds('../pe_summary_060623/mvr2.rds') |> mutate(mv = mve_rsq-base_rsq)
pe <- tbl(con, "pe")
pe_r2 <- tbl(con, "rsq")
glanced <- tbl(con, "glanced")
# Attach per-model R2 values to the glance()-style model summaries.
glanced <- glanced |> left_join(pe_r2 |> select(exposure, phenotype, series, model_number,aggregate_base_model, rsq, adj.r2), by=c("exposure", "phenotype", "model_number", "aggregate_base_model", "series"))
variable_domain <- tbl(con, "variable_domain")
adjusted_meta_uwls <- tbl(con, "adjusted_meta_uwls")
# Merge UWLS meta-analysis columns, then adopt Stouffer's combined p-value as
# the working p.value for all downstream filtering and plotting.
adjusted_meta <- adjusted_meta |> left_join(adjusted_meta_uwls |> rename(term_name=expo_name), by = c("term_name", "evarname", "pvarname","model_number"))
adjusted_meta <- adjusted_meta |> mutate(p.value=p.value.stouffer)
#p <- ggplot(adjusted_meta |> filter(nobs >= 2, model_number==2), aes((i.squared), (i.squared.uwls)))
#p <- p + geom_point()
#p
#p <- ggplot(adjusted_meta |> filter(nobs >= 2, model_number==2), aes(-log10(p.value.stouffer), -log10(p.value)))
#p <- p + geom_point()
#p
# Coefficient rows for exposure terms only (term names starting with "expo"),
# keyed by exposure (evarname) and phenotype (pvarname).
expos <- pe |> filter(term %like% 'expo%') |> rename(evarname=exposure, pvarname=phenotype)
# Wide layout: one column set (estimate/SE/statistic/p.value) per model number.
expos_wide <- expos |> pivot_wider(names_from = "model_number", values_from = c("estimate", "std.error", "statistic", "p.value"))
# Fit statistics wide by (model_number, aggregate_base_model).
glanced_wide <- glanced |> select(-c(adj.r2, df.residual, null.deviance, df.null, deviance)) |> pivot_wider(names_from=c("model_number", "aggregate_base_model"), values_from = c("rsq", "nobs", "AIC", "BIC")) |> rename(evarname=exposure, pvarname=phenotype)
# R2 gains attributable to the exposure: model 2 vs. its base covariate model
# (rsq_2_0 - rsq_2_1) and model 2 vs. model 1 (rsq_2_0 - rsq_1_0).
glanced_wide <- glanced_wide |> mutate(rsq_adjusted_base_diff=rsq_2_0-rsq_2_1, rsq_adjusted_diff = rsq_2_0-rsq_1_0)
expos_wide <- expos_wide |> left_join(glanced_wide |> select(-c(series, log_p, log_e, scaled_p, scaled_e)), by=c("evarname", "pvarname", "exposure_table_name", "phenotype_table_name"))
# Attach human-readable descriptions for exposure, then phenotype, variables.
expos_wide <- expos_wide |> left_join(varnames, by=c("evarname"="Variable.Name", "exposure_table_name"="Data.File.Name"))
expos_wide <- expos_wide |> left_join(varnames |> select(Variable.Name, Data.File.Name, Variable.Description, Data.File.Description),
by=c("pvarname"="Variable.Name", "phenotype_table_name"="Data.File.Name"))
# Materialize locally and disambiguate the duplicated description columns
# (.x = exposure-side join, .y = phenotype-side join).
expos_wide <- expos_wide |> collect() |> select(-Use.Constraints) |> rename(e_data_file_desc=Data.File.Description.x, p_data_file_desc=Data.File.Description.y,e_variable_description=Variable.Description.x, p_variable_description=Variable.Description.y)
# Per-pair mean R2 gains across surveys and pooled sample size, using only the
# exposure coefficient rows (expo/expo1/expo2).
expos_wide_summary <- expos_wide |> filter(term == 'expo' | term == 'expo1' | term == 'expo2') |> group_by(evarname, pvarname) |> summarize(mean_adjusted_base_r2_diff = mean(rsq_adjusted_base_diff), mean_unadjusted_r2_diff=mean(rsq_adjusted_diff), total_n = sum(nobs_2_0)) |> ungroup()
## `summarise()` has grouped output by 'evarname'. You can override using the
## `.groups` argument.
# Fully adjusted (model 2) meta-analytic results joined with the R2 summaries.
adjusted_meta_full <- adjusted_meta |> filter(model_number == 2) |> collect() |> left_join(expos_wide_summary, by=c("evarname", "pvarname")) ## fully adjusted model
# One row per variable with description/category/subcategory labels; first()
# collapses duplicates across data files.
p_variable_domain <- variable_domain |> filter(epcf == 'p') |> collect() |> group_by(Variable.Name) |> summarise(pvardesc=first(Variable.Description),pcategory=first(category),psubcategory=first(subcategory))
e_variable_domain <- variable_domain |> filter(epcf == 'e') |> collect() |> group_by(Variable.Name) |> summarise(evardesc=first(Variable.Description),ecategory=first(category),esubcategory=first(subcategory))
# Attach category labels to the meta-analysis, coefficient, and mvr2 tables.
adjusted_meta_full <- adjusted_meta_full |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
adjusted_meta_full <- adjusted_meta_full |> left_join(e_variable_domain, by=c("evarname"="Variable.Name"))
expos_wide <- expos_wide |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
expos_wide <- expos_wide |> left_join(e_variable_domain, by=c("evarname"="Variable.Name"))
mvr2 <- mvr2 |> collect() |> left_join(p_variable_domain, by=c("pvarname"="Variable.Name"))
Number of unique exposures and phenotypes
# Count the distinct exposure (E) and phenotype (P) variables in the fully
# adjusted meta-analysis, and the size of the full E x P search space.
num_e <- n_distinct(adjusted_meta_full$evarname)
num_p <- n_distinct(adjusted_meta_full$pvarname)
num_e
## [1] 854
num_p
## [1] 319
num_e * num_p
## [1] 272426
Number of exposures, phenotypes, and associations observed in X number of
surveys
#adjusted_meta <- adjusted_meta |> unnest(glanced) |> unnest(tidied)
# For each possible survey count n, tally how many distinct exposures,
# phenotypes, and E-P associations were observed in exactly n surveys.
n_obss <- sort(unique(adjusted_meta_full$nobs))
num_tests <- map_df(n_obss, function(n) {
  in_n_surveys <- adjusted_meta_full |> filter(nobs == n)
  tibble(
    n_expos = length(unique(in_n_surveys$evarname)),
    n_phenos = length(unique(in_n_surveys$pvarname)),
    n_pxe = nrow(in_n_surveys)
  )
})
num_tests |> mutate(n_surveys=n_obss) |> gt()
| n_expos |
n_phenos |
n_pxe |
n_surveys |
| 854 |
319 |
64488 |
1 |
| 649 |
278 |
32439 |
2 |
| 528 |
241 |
17743 |
3 |
| 414 |
143 |
11084 |
4 |
| 355 |
117 |
4569 |
5 |
| 324 |
79 |
3736 |
6 |
| 205 |
64 |
2249 |
7 |
| 177 |
60 |
1602 |
8 |
| 28 |
60 |
593 |
9 |
| 6 |
29 |
60 |
10 |
Keep associations where the number of surveys is at least 2
# Restrict to associations replicated in at least two survey cycles, and count
# the distinct E and P variables that remain.
adjusted_meta_2 <- adjusted_meta_full |> filter(nobs >= 2)
n_evars <- n_distinct(adjusted_meta_2$evarname)
n_pvars <- n_distinct(adjusted_meta_2$pvarname)
Sample sizes within and across all surveys
# Per-pair sample sizes: total, median, and number of contributing surveys
# for each E-P pair (one exposure coefficient row per survey).
# NOTE(review): filters to terms 'expo'/'expo1' only, while the summary above
# also includes 'expo2' — confirm this asymmetry is intended.
sample_size_per_pair <- expos_wide |> filter(term == 'expo' | term== 'expo1') |> group_by(evarname, pvarname) |> summarize(total_n=sum(nobs_2_0), n_surveys=n(), median_n=median(nobs_2_0))
## `summarise()` has grouped output by 'evarname'. You can override using the
## `.groups` argument.
Summary of the summary stats
# Multiple-testing correction across all remaining tests: Benjamini-Yekutieli
# FDR and Bonferroni; then label each association by the strictest threshold
# it passes (case_when takes the first matching condition).
adjusted_meta_2 <- adjusted_meta_2 |> ungroup() |> mutate(pval_BY=p.adjust(p.value, method="BY"), pvalue_bonferroni=p.adjust(p.value, method="bonferroni"))
adjusted_meta_2 <- adjusted_meta_2 |> mutate(sig_levels = case_when(
pvalue_bonferroni < 0.05 ~ 'Bonf.<0.05',
pval_BY < 0.05 ~ 'BY<0.05',
TRUE ~ '> BY & Bonf.'
))
# Nominal p-value threshold equivalent to family-wise Bonferroni 0.05.
bonf_thresh <- 0.05/nrow(adjusted_meta_2)
# Distribution of meta-analytic p-values and effect-size estimates, plus the
# overall Bonferroni-significant hit rate.
# Fix: spell out na.rm = TRUE instead of the reassignable shorthand `T`.
quantile(adjusted_meta_2$p.value, probs=c(0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99), na.rm = TRUE)
## 1% 5% 10% 20% 30% 40%
## 6.304519e-42 3.198393e-12 9.465745e-07 1.204090e-03 1.558012e-02 6.279430e-02
## 50% 60% 70% 80% 90% 95%
## 1.474735e-01 2.675260e-01 4.240537e-01 6.039172e-01 7.991716e-01 8.995298e-01
## 99%
## 9.799332e-01
quantile(adjusted_meta_2$estimate, probs=c(0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99), na.rm = TRUE)
## 1% 5% 10% 20% 30% 40%
## -0.246857718 -0.093162824 -0.061155239 -0.034324215 -0.020507593 -0.010977694
## 50% 60% 70% 80% 90% 95%
## -0.002793128 0.005238097 0.015238188 0.029132279 0.057786585 0.100187654
## 99%
## 0.301700609
# Fraction of all tests passing Bonferroni 5%.
sum(adjusted_meta_2$pvalue_bonferroni < 0.05)/nrow(adjusted_meta_2)
## [1] 0.09754978
# Counts per significance tier, the weakest BY-significant hits, R2-gain
# quartiles within each tier, and Bonferroni hits ranked by R2 gain.
adjusted_meta_2 |> group_by(sig_levels) |> count()
adjusted_meta_2 |> filter(sig_levels == 'BY<0.05') |> arrange(-p.value) |> head()
adjusted_meta_2 |> group_by(sig_levels) |> summarize(r2_25=quantile(mean_adjusted_base_r2_diff, probs=.25, na.rm = T),
r2_50=quantile(mean_adjusted_base_r2_diff, probs=.5, na.rm = T),
r2_75=quantile(mean_adjusted_base_r2_diff, probs=.75, na.rm = T),
r2_100=max(mean_adjusted_base_r2_diff, na.rm = T))
adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> arrange(-mean_adjusted_base_r2_diff)
## QQ plot of observed p-values against the expected uniform quantiles.
# Fix: use seq_len(nrow(...)) instead of 1:nrow(...) (the latter yields
# c(1, 0) when the table is empty).
# NOTE(review): sort() drops NA while nrow() counts them — assumes p.value has
# no NAs here; confirm upstream.
pval_qq <- data.frame(observed = sort(adjusted_meta_2$p.value), expected = seq_len(nrow(adjusted_meta_2))/nrow(adjusted_meta_2))
qq_p <- ggplot(pval_qq, aes(-log10(expected), -log10(observed)))
qq_p <- qq_p + geom_point()
##
# Histogram of the raw p-value distribution (blue line at count 1 for reference).
p <- ggplot(pval_qq, aes(observed))
p <- p + geom_histogram(bins=100) + theme_bw()
p <- p + geom_hline(yintercept = 1, color='blue')
p <- p + xlab("P-E association pvalue")
p_hist <- p
p_hist

# Histogram of the extreme tail (p < 1e-3) on the -log10 scale, truncated at 100.
p <- ggplot(pval_qq |> filter(observed < 1e-3), aes(-log10(observed)))
p <- p + geom_histogram(bins=200) + theme_bw() + scale_x_continuous(limits=c(0, 100))
p <- p + geom_hline(yintercept = 1, color='blue')
p <- p + xlab("P-E association pvalue")
p_hist <- p
p_hist
## Warning: Removed 180 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# P-value densities faceted by exposure (A) and phenotype (B) category.
p <- ggplot(adjusted_meta_2, aes(p.value))
p <- p + geom_density() + theme_bw() + facet_grid(~ecategory)
p1 <- p + xlab("P-E association pvalue")
p <- ggplot(adjusted_meta_2, aes(p.value))
p <- p + geom_density() + theme_bw() + facet_grid(~pcategory)
p2 <- p + xlab("P-E association pvalue")
plot_grid(p1, p2, nrow=2, labels=c("A", "B"))

Percent of associations found for each E or P variable, by category
# Collapse subcategory into category when the subcategory is missing, then
# compute per-exposure-category counts and mean R2 gain by significance level.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(ecat=ifelse(is.na(esubcategory), ecategory, esubcategory))
adjusted_meta_2 <- adjusted_meta_2 |> mutate(pcat=ifelse(is.na(psubcategory), pcategory, psubcategory))
e_total_found <- adjusted_meta_2 |> group_by(ecat, sig_levels) |> summarize(n=n(), mean.r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE))
## `summarise()` has grouped output by 'ecat'. You can override using the
## `.groups` argument.
# Fix: join key made explicit (was inferred, emitting 'Joining, by = "ecat"').
e_total_found <- e_total_found |> ungroup() |> group_by(ecat) |> summarize(total_tests=sum(n)) |> left_join(e_total_found, by = "ecat") |> mutate(pct_sig_per_category = n/total_tests, pct_sig_overall=n/sum(total_tests))
# Share of all tests that are significant, per exposure category.
p <- ggplot(e_total_found |> filter(sig_levels != "> BY & Bonf."), aes(ecat, pct_sig_overall))
p <- p + geom_bar(stat = "identity")
p

# Mean R2 gain per exposure category (any significant tier).
p <- ggplot(e_total_found |> filter(sig_levels != "> BY & Bonf."), aes(ecat, mean.r2))
p <- p + geom_bar(stat = "identity")
p

# Same, restricted to BY-significant associations.
p <- ggplot(e_total_found |> filter(sig_levels == "BY<0.05"), aes(ecat, mean.r2))
p <- p + geom_bar(stat = "identity")
p

# Mean R2 gain vs. overall fraction significant, labeled by exposure category.
p <- ggplot(e_total_found |> filter(sig_levels == "BY<0.05"), aes(pct_sig_overall, mean.r2, label=ecat))
p <- p + geom_point() + theme_bw() + geom_text_repel()
p

# E-category x P-category cross-tabulation of hit rates and mean R2 gain.
ep_total_found <- adjusted_meta_2 |> group_by(ecat, pcat, sig_levels) |> summarize(n=n(), mean.r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE))
## `summarise()` has grouped output by 'ecat', 'pcat'. You can override using the
## `.groups` argument.
# Fix: join key made explicit (was inferred: 'Joining, by = c("ecat", "pcat")').
ep_total_found <- ep_total_found |> ungroup() |> group_by(ecat,pcat) |> summarize(total_tests=sum(n)) |> left_join(ep_total_found, by = c("ecat", "pcat")) |> mutate(pct_sig_per_category = n/total_tests, pct_sig_overall=n/sum(total_tests))
# Mean R2 gain vs. fraction significant, faceted by phenotype category.
p <- ggplot(ep_total_found |> filter(sig_levels == "BY<0.05"), aes(pct_sig_overall, mean.r2, label=ecat))
p <- p + geom_point() + facet_wrap(~pcat) + theme_bw() + geom_text_repel()
p
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 18 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 20 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Phenotype-category hit rates and mean R2 gain.
p_total_found <- adjusted_meta_2 |> group_by(pcat, sig_levels) |> summarize(n=n(), mean.r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE))
## `summarise()` has grouped output by 'pcat'. You can override using the
## `.groups` argument.
# Fix: join key made explicit (was inferred: 'Joining, by = "pcat"').
p_total_found <- p_total_found |> ungroup() |> group_by(pcat) |> summarize(total_tests=sum(n)) |> left_join(p_total_found, by = "pcat") |> mutate(pct_sig_per_category = n/total_tests, pct_sig_overall=n/sum(total_tests))
# Mean R2 gain (%) vs. percent significant, overall and within-category.
p <- ggplot(p_total_found |> filter(sig_levels == "BY<0.05"), aes(pct_sig_overall*100, mean.r2*100, label=pcat))
p <- p + geom_point() + theme_bw() + geom_text_repel() + xlab("Less than BY 5% (%)") + ylab("R-squared (%)")
p

p <- ggplot(p_total_found |> filter(sig_levels == "BY<0.05"), aes(pct_sig_per_category*100, mean.r2*100, label=pcat))
p <- p + geom_point() + theme_bw() + geom_text_repel() + xlab("Less than BY 5% (% of all tests in category)") + ylab("R-squared (%)")
p

library(pwr)
power_from_r2 <- function(n, r2, u, pval) {
  # Power of the F-test for a partial R^2 of size `r2` with `u` numerator df,
  # sample size `n`, and significance level `pval` (Cohen's f2 = r2/(1-r2)),
  # via pwr::pwr.f2.test. Returns NA_real_ when n or r2 is missing.
  # Fixes: scalar `||` instead of vectorized `|` in the if() condition
  # (R >= 4.3 errors on length > 1 `||` operands, surfacing misuse), and
  # NA_real_ instead of logical NA so map_dbl() callers stay type-stable.
  if (is.na(n) || is.na(r2)) {
    return(NA_real_)
  }
  v <- n - u - 1  # denominator (error) degrees of freedom
  pr <- pwr.f2.test(u = u, v = v, f2 = r2 / (1 - r2), sig.level = pval)
  pr$power
}
# Power to detect fixed partial-R2 effects (0.01, 0.001, 0.005) at the
# Bonferroni threshold, given each pair's pooled sample size.
pwr_01 <- map_dbl(adjusted_meta_2$total_n, ~power_from_r2(.x, .01, 1, bonf_thresh))
pwr_001 <- map_dbl(adjusted_meta_2$total_n, ~power_from_r2(.x, .001, 1, bonf_thresh))
pwr_005 <- map_dbl(adjusted_meta_2$total_n, ~power_from_r2(.x, .005, 1, bonf_thresh))
adjusted_meta_2 <- adjusted_meta_2 |> mutate(power_01 = pwr_01, power_001 = pwr_001, power_005=pwr_005)
# Power vs. sample size by exposure (A) and phenotype (B) category; the
# horizontal line marks 80% power.
p <- ggplot(adjusted_meta_2, aes(total_n, power_01))
p <- p + geom_point(shape=1) + geom_rug(col=rgb(.5,0,0,alpha=.2)) + facet_wrap(~ecategory, ncol = 4) + geom_hline(yintercept = 0.8)
p1 <- p + theme_bw()
p <- ggplot(adjusted_meta_2, aes(total_n, power_01))
p <- p + geom_point(shape=1) + geom_rug(col=rgb(.5,0,0,alpha=.2)) + facet_wrap(~pcategory, ncol = 4) + geom_hline(yintercept = 0.8)
p2 <- p + theme_bw()
plot_grid(p1, p2)
## Warning: Removed 1119 rows containing missing values (geom_point).
## Removed 1119 rows containing missing values (geom_point).

Zoom in on the distribution
# Zoomed p-value distributions (histogram + density) by E and P category.
p_plot <- adjusted_meta_2 |> select(ecategory, pcategory, p.value)
p <- ggplot(p_plot,aes(p.value))
# Fix: after_stat(density) replaces the deprecated ..density.. dot-dot
# notation (after_stat is available in the ggplot2 3.3.6 attached above).
p <- p + geom_histogram(aes(y=after_stat(density))) + geom_density() + theme_bw() + facet_grid(~ecategory)
p1 <- p + xlab("P-E association pvalue")
p <- ggplot(p_plot, aes(p.value))
p <- p + geom_histogram(aes(y=after_stat(density))) + geom_density() + theme_bw() + facet_grid(~pcategory)
p2 <- p + xlab("P-E association pvalue")
plot_grid(p1, p2, nrow=2, labels=c("A", "B"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Association Size vs. -log10(pvalue)
# Per-exposure summary: mean R2 gain and |estimate|, hit counts, and the
# single strongest (lowest Bonferroni p; first() after arrange) association.
# Fix throughout: na.rm = TRUE spelled out instead of the shorthand `T`.
e_summary <- adjusted_meta_2 |> group_by(evarname) |> arrange(pvalue_bonferroni) |>
summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE), mean_estimate=mean(abs(estimate), na.rm=TRUE),
median_pvalue=median(p.value, na.rm=TRUE), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=TRUE),
n_tests=sum(!is.na(pvalue_bonferroni)), median_i.squared=median(i.squared, na.rm=TRUE),
max_r2=first(mean_adjusted_base_r2_diff), max_pvarname=first(pvarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)
# Per-phenotype analogue of e_summary.
p_summary <- adjusted_meta_2 |> group_by(pvarname) |> arrange(pvalue_bonferroni) |>
summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE), mean_estimate=mean(abs(estimate), na.rm=TRUE),
median_pvalue=median(p.value, na.rm=TRUE), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=TRUE),
n_tests=sum(!is.na(pvalue_bonferroni)), median_i.squared=median(i.squared, na.rm=TRUE),
max_r2=first(mean_adjusted_base_r2_diff), max_evarname=first(evarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)
## deeper summary by category_subcategory composites (unite() joins with "_")
p_group_summary <- adjusted_meta_2 |> unite(p_scategory, c(pcategory, psubcategory)) |> group_by(p_scategory) |> arrange(pvalue_bonferroni) |>
summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE), mean_estimate=mean(abs(estimate), na.rm=TRUE),
median_pvalue=median(p.value, na.rm=TRUE), n_sig=sum(pvalue_bonferroni < 0.05, na.rm=TRUE),
n_tests=sum(!is.na(pvalue_bonferroni)), median_i.squared=median(i.squared, na.rm=TRUE),
max_r2=first(mean_adjusted_base_r2_diff), max_evarname=first(evarname) , max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)
e_group_summary <- adjusted_meta_2 |> unite(e_scategory, c(ecategory, esubcategory)) |> group_by(e_scategory) |> arrange(pvalue_bonferroni) |>
summarize(mean_r2=mean(mean_adjusted_base_r2_diff, na.rm=TRUE),
mean_abs_estimate=mean(abs(estimate), na.rm=TRUE),
mean_estimate=mean((estimate), na.rm=TRUE),
median_pvalue=median(p.value, na.rm=TRUE),
n_sig=sum(pvalue_bonferroni < 0.05, na.rm=TRUE),
n_tests=sum(!is.na(pvalue_bonferroni)),
median_i.squared=median(i.squared, na.rm=TRUE),
max_r2=first(mean_adjusted_base_r2_diff),
max_pvarname=first(pvarname),
max_estimate=first(estimate), max_p.value=first(p.value)) |> mutate(n_sig_pct=n_sig/n_tests)
Tables 1 and 2
##
remove_units_from_string <- function(vardesc) {
  # Drop a trailing parenthesized units annotation from a variable description,
  # e.g. "Lead (ug/dL)" -> "Lead ". The pattern is anchored at end-of-string,
  # so at most one match is possible and sub() is equivalent to gsub() here;
  # the greedy ".*" deliberately spans from the first "(" to the final ")".
  sub("\\(.*\\)$", "", vardesc)
}
# Table 1: per-exposure-domain summary. Within each E category_subcategory,
# count tests/hits and pull the top example (lowest Bonferroni p, then R2).
e_group_summary <- adjusted_meta_2 |> filter(term_name %in% c('expo', 'expo1', 'expo2', 'expo3')) |> unite(e_scategory, c(ecategory, esubcategory)) |> group_by(e_scategory) |> arrange(pvalue_bonferroni,mean_adjusted_base_r2_diff) |>
summarize(
#median_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),
#median_abs_estimate=median(abs(estimate), na.rm=T),
n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T),
n_tests=sum(!is.na(pvalue_bonferroni)),
#median_i.squared=median(i.squared, na.rm=T),
max_r2=first(mean_adjusted_base_r2_diff),
max_termname=remove_units_from_string(first(term_name)),
max_pvarname=remove_units_from_string(first(pvardesc)),
max_evarname=remove_units_from_string(first(evardesc)),
max_estimate=first(estimate), max_p.value=first(p.value), max_i.squared=first(i.squared)) |> mutate(n_sig_pct=n_sig/n_tests)
# Quartiles of Bonferroni-significant effect sizes, split by sign of estimate.
e_bonf_group_summary <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |>
unite(e_scategory, c(ecategory, esubcategory)) |>
mutate(sgn=ifelse(sign(estimate) <0, 'neg', 'pos')) |> group_by(e_scategory, sgn) |> arrange(pvalue_bonferroni) |>
summarize(median_bonf_r2=median(mean_adjusted_base_r2_diff, na.rm=T),
q25_bonf_estimate=quantile(estimate, probs=.25, na.rm=T),
median_bonf_estimate=median((estimate), na.rm=T),
q75_bonf_estimate=quantile(estimate, probs=.75, na.rm=T)
)
## `summarise()` has grouped output by 'e_scategory'. You can override using the
## `.groups` argument.
# One row per domain with _neg/_pos column pairs.
e_bonf_group_summary <- e_bonf_group_summary |> pivot_wider(names_from=sgn, values_from=(c(median_bonf_r2, q25_bonf_estimate, median_bonf_estimate, q75_bonf_estimate)))
## merge the sign-stratified quartiles into the per-domain summary
e_group_summary <- e_group_summary |> left_join(e_bonf_group_summary, by='e_scategory')
e_group_summary <- e_group_summary |> separate(col=e_scategory, into=c("escategory", "esubcategory"), sep="_")
e_group_summary <- e_group_summary |> filter(n_sig > 1)
e_group_summary <- e_group_summary |> select(escategory, esubcategory, n_tests, n_sig_pct, median_bonf_r2_neg, median_bonf_r2_pos, q25_bonf_estimate_neg, median_bonf_estimate_neg, q75_bonf_estimate_neg, q25_bonf_estimate_pos, median_bonf_estimate_pos, q75_bonf_estimate_pos,
max_pvarname, max_evarname, max_estimate, max_r2, max_p.value, max_i.squared)
# Render with gt (note: e_group_summary is reassigned to a gt object here).
e_group_summary<- e_group_summary |> gt() |>
fmt_number(columns = c(starts_with("q"), starts_with("med")), decimals = 3) |>
fmt_number(columns = c(max_r2, max_estimate), decimals = 3) |>
fmt_number(columns = c(n_sig_pct), decimals = 2) |>
fmt_number(columns = c(max_i.squared), decimals = 0) |>
fmt_scientific(columns = max_p.value, decimals = 0)
e_group_summary <- e_group_summary |>
tab_spanner(label = "Effect Sizes < 0",columns = ends_with("neg")) |>
tab_spanner(label = "Effect Sizes > 0",columns = ends_with("pos"))
e_group_summary <- e_group_summary |>
tab_spanner(label = "Example P-E Association (lowest p.value for E domain)",columns = starts_with("max"))
e_group_summary |> gtsave('./e_group_summary_v2.html')
e_group_summary
| escategory |
esubcategory |
n_tests |
n_sig_pct |
Effect Sizes < 0
|
Effect Sizes > 0
|
Example P-E Association (lowest p.value for E domain)
|
| median_bonf_r2_neg |
q25_bonf_estimate_neg |
median_bonf_estimate_neg |
q75_bonf_estimate_neg |
median_bonf_r2_pos |
q25_bonf_estimate_pos |
median_bonf_estimate_pos |
q75_bonf_estimate_pos |
max_pvarname |
max_evarname |
max_estimate |
max_r2 |
max_p.value |
max_i.squared |
| allergy |
NA |
404 |
0.04 |
0.001 |
−1.355 |
−1.355 |
−1.355 |
0.001 |
0.254 |
0.337 |
0.383 |
Android/gynoid region status |
Tissue transglutaminase |
0.321 |
0.001 |
5 × 10−17 |
49 |
| housing |
NA |
534 |
0.01 |
0.001 |
−0.426 |
−0.332 |
−0.230 |
0.001 |
0.238 |
0.413 |
0.607 |
Alkaline phosphotase |
Is this {mobile home/house/apartment} owned, being bought, rented, or occup |
−0.102 |
0.002 |
4 × 10−21 |
33 |
| infection |
NA |
4783 |
0.09 |
0.001 |
−0.684 |
−0.420 |
−0.295 |
0.001 |
0.143 |
0.329 |
0.557 |
Iron, refigerated |
Hepatitis D |
−1.099 |
0.001 |
0 |
99 |
| nutrients |
dietary biomarker |
4949 |
0.25 |
0.008 |
−0.143 |
−0.095 |
−0.068 |
0.009 |
0.073 |
0.100 |
0.160 |
Android percent fat |
g-tocopherol |
0.278 |
0.073 |
0 |
91 |
| nutrients |
dietary interview |
14246 |
0.07 |
0.003 |
−0.088 |
−0.065 |
−0.049 |
0.004 |
0.058 |
0.076 |
0.104 |
Direct HDL-Cholesterol |
Alcohol |
0.206 |
0.038 |
3 × 10−120 |
7 |
| nutrients |
supplements |
10466 |
0.01 |
0.005 |
−0.117 |
−0.074 |
−0.064 |
0.007 |
0.073 |
0.094 |
0.136 |
5-Methyl-tetrahydrofolate |
Any Dietary Supplements taken in the past 24 hour? |
−0.534 |
0.059 |
9 × 10−150 |
60 |
| physical activity |
NA |
2826 |
0.19 |
0.005 |
−0.093 |
−0.070 |
−0.051 |
0.004 |
0.047 |
0.059 |
0.083 |
60 sec. pulse (30 sec. pulse * 2): |
MET Total |
−0.133 |
0.015 |
5 × 10−143 |
43 |
| pollutant |
amide |
198 |
0.30 |
0.007 |
−0.107 |
−0.084 |
−0.060 |
0.005 |
0.063 |
0.076 |
0.107 |
White blood cell count |
Acrylamide |
0.143 |
0.018 |
4 × 10−39 |
12 |
| pollutant |
amine |
156 |
0.16 |
0.011 |
−0.137 |
−0.119 |
−0.101 |
0.016 |
0.145 |
0.164 |
0.184 |
Glycohemoglobin |
o-Toluidine, urine |
0.203 |
0.018 |
2 × 10−23 |
27 |
| pollutant |
diakyl |
2148 |
0.05 |
0.004 |
−0.084 |
−0.068 |
−0.054 |
0.007 |
0.053 |
0.080 |
0.108 |
Mean cell volume |
Urinary thiocyanate |
0.135 |
0.016 |
4 × 10−76 |
81 |
| pollutant |
heavy metals |
6581 |
0.17 |
0.006 |
−0.119 |
−0.094 |
−0.073 |
0.005 |
0.052 |
0.076 |
0.105 |
Weight |
Lead |
−0.173 |
0.021 |
4 × 10−171 |
25 |
| pollutant |
hydrocarbon |
1647 |
0.16 |
0.007 |
−0.120 |
−0.097 |
−0.078 |
0.007 |
0.082 |
0.102 |
0.128 |
White blood cell count |
2-fluorene |
0.213 |
0.032 |
1 × 10−53 |
61 |
| pollutant |
organochlorine |
4885 |
0.08 |
0.018 |
−0.284 |
−0.178 |
−0.128 |
0.020 |
0.128 |
0.167 |
0.231 |
Thigh Circumference |
PCB180 |
−0.400 |
0.051 |
2 × 10−56 |
93 |
| pollutant |
organophosphate |
423 |
0.11 |
0.001 |
−0.036 |
−0.019 |
−0.014 |
0.001 |
0.019 |
0.021 |
0.030 |
Subscapular Skinfold |
Propylenethio urea |
−0.040 |
0.002 |
5 × 10−73 |
96 |
| pollutant |
phenols |
2173 |
0.06 |
0.008 |
−0.124 |
−0.101 |
−0.075 |
0.006 |
0.061 |
0.084 |
0.101 |
Weight |
Ethyl paraben |
−0.124 |
0.014 |
8 × 10−50 |
0 |
| pollutant |
phthalates |
2306 |
0.10 |
0.005 |
−0.104 |
−0.081 |
−0.063 |
0.004 |
0.057 |
0.074 |
0.094 |
Transferrin receptor |
Mono-n-octyl phthalate |
0.082 |
0.018 |
2 × 10−104 |
98 |
| pollutant |
polyfluoro |
2346 |
0.09 |
0.010 |
−0.125 |
−0.099 |
−0.069 |
0.007 |
0.052 |
0.080 |
0.112 |
L3 Bone Mineral Composition |
Perfluorooctane sulfonamide |
0.039 |
0.003 |
2 × 10−73 |
96 |
| pollutant |
priority pesticide |
2502 |
0.10 |
0.001 |
−0.044 |
−0.017 |
−0.010 |
0.001 |
0.011 |
0.016 |
0.030 |
Apolipoprotein |
2,4,5-T (ug/L) result |
0.054 |
0.004 |
6 × 10−280 |
100 |
| pollutant |
VOC |
4492 |
0.12 |
0.001 |
−0.089 |
−0.027 |
−0.015 |
0.001 |
0.016 |
0.026 |
0.088 |
Mefox oxidation product |
Blood Chlorobenzene Result |
0.062 |
0.005 |
0 |
100 |
| smoking |
smoking behavior |
1873 |
0.06 |
0.007 |
−0.294 |
−0.213 |
−0.157 |
0.008 |
0.118 |
0.186 |
0.321 |
Mean cell volume |
Smoking: current, ever, never |
0.482 |
0.033 |
3 × 10−203 |
28 |
| smoking |
smoking biomarker |
1088 |
0.13 |
0.004 |
−0.086 |
−0.071 |
−0.055 |
0.008 |
0.066 |
0.104 |
0.162 |
White blood cell count |
Cotinine |
0.177 |
0.027 |
1 × 10−234 |
83 |
##
# Table 2: per-phenotype-domain summary, analogous to Table 1 but additionally
# capping the R2 gain at 0.1 to exclude extreme outliers.
p_group_summary <- adjusted_meta_2 |> filter(term_name %in% c('expo', 'expo1', 'expo2', 'expo3')) |> filter(mean_adjusted_base_r2_diff <= .1) |> unite(p_scategory, c(pcategory, psubcategory)) |> group_by(p_scategory) |> arrange(pvalue_bonferroni,mean_adjusted_base_r2_diff) |>
summarize(
#median_r2=mean(mean_adjusted_base_r2_diff, na.rm=T),
#median_abs_estimate=median(abs(estimate), na.rm=T),
n_sig=sum(pvalue_bonferroni < 0.05, na.rm=T),
n_tests=sum(!is.na(pvalue_bonferroni)),
#median_i.squared=median(i.squared, na.rm=T),
max_r2=first(mean_adjusted_base_r2_diff),
max_termname=remove_units_from_string(first(term_name)),
max_pvarname=remove_units_from_string(first(pvardesc)),
max_evarname=remove_units_from_string(first(evardesc)),
max_estimate=first(estimate), max_p.value=first(p.value), max_i.squared=first(i.squared)) |> mutate(n_sig_pct=n_sig/n_tests)
# Quartiles of Bonferroni-significant effect sizes by sign, per P domain.
p_bonf_group_summary <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |>
unite(p_scategory, c(pcategory, psubcategory)) |>
mutate(sgn=ifelse(sign(estimate) <0, 'neg', 'pos')) |> group_by(p_scategory, sgn) |> arrange(pvalue_bonferroni) |>
summarize(median_bonf_r2=median(mean_adjusted_base_r2_diff, na.rm=T),
q25_bonf_estimate=quantile(estimate, probs=.25, na.rm=T),
median_bonf_estimate=median((estimate), na.rm=T),
q75_bonf_estimate=quantile(estimate, probs=.75, na.rm=T)
)
## `summarise()` has grouped output by 'p_scategory'. You can override using the
## `.groups` argument.
p_bonf_group_summary <- p_bonf_group_summary |> pivot_wider(names_from=sgn, values_from=(c(median_bonf_r2, q25_bonf_estimate, median_bonf_estimate, q75_bonf_estimate)))
## merge the sign-stratified quartiles into the per-domain summary
p_group_summary <- p_group_summary |> left_join(p_bonf_group_summary, by='p_scategory')
p_group_summary <- p_group_summary |> separate(col=p_scategory, into=c("pscategory", "psubcategory"), sep="_")
p_group_summary <- p_group_summary |> filter(n_sig > 1)
p_group_summary <- p_group_summary |> select(pscategory, psubcategory, n_tests, n_sig_pct, q25_bonf_estimate_neg, median_bonf_estimate_neg, q75_bonf_estimate_neg, q25_bonf_estimate_pos, median_bonf_estimate_pos, q75_bonf_estimate_pos,
max_pvarname, max_evarname, max_estimate, max_r2, max_p.value, max_i.squared)
# Render with gt (p_group_summary becomes a gt object from here on).
p_group_summary<- p_group_summary |> gt() |>
fmt_number(columns = c(starts_with("q"), starts_with("med")), decimals = 3) |>
fmt_number(columns = c(max_r2, max_estimate), decimals = 3) |>
fmt_number(columns = c(n_sig_pct), decimals = 2) |>
fmt_number(columns = c(max_i.squared), decimals = 0) |>
fmt_scientific(columns = max_p.value, decimals = 0)
p_group_summary <- p_group_summary |>
tab_spanner(label = "Effect Sizes < 0",columns = ends_with("neg")) |>
tab_spanner(label = "Effect Sizes > 0",columns = ends_with("pos"))
p_group_summary <- p_group_summary |>
tab_spanner(label = "Example P-E Association (lowest p.value for E domain)",columns = starts_with("max"))
p_group_summary |> gtsave('./p_group_summary_v2.html')
p_group_summary
| pscategory |
psubcategory |
n_tests |
n_sig_pct |
Effect Sizes < 0
|
Effect Sizes > 0
|
Example P-E Association (lowest p.value for E domain)
|
| q25_bonf_estimate_neg |
median_bonf_estimate_neg |
q75_bonf_estimate_neg |
q25_bonf_estimate_pos |
median_bonf_estimate_pos |
q75_bonf_estimate_pos |
max_pvarname |
max_evarname |
max_estimate |
max_r2 |
max_p.value |
max_i.squared |
| aging |
NA |
58 |
0.03 |
NA |
NA |
NA |
0.371 |
0.435 |
0.499 |
Asso. Std. Dev. of Mean T/S ratio |
Hepatitis D |
0.306 |
0.000 |
7 × 10−21 |
97 |
| anthropometric |
dexa |
24753 |
0.08 |
−0.130 |
−0.097 |
−0.074 |
0.073 |
0.122 |
0.341 |
Android percent fat |
g-tocopherol |
0.278 |
0.073 |
0 |
91 |
| anthropometric |
NA |
5953 |
0.20 |
−0.124 |
−0.081 |
−0.051 |
0.052 |
0.072 |
0.144 |
Waist Circumference |
cis-b-carotene |
−0.253 |
0.056 |
6 × 10−304 |
64 |
| biochemistry |
bone |
849 |
0.12 |
−0.090 |
−0.073 |
−0.052 |
0.065 |
0.088 |
0.126 |
Total calcium |
Retinol |
0.236 |
0.046 |
3 × 10−134 |
0 |
| biochemistry |
electrolyte |
1925 |
0.08 |
−0.103 |
−0.078 |
−0.036 |
0.062 |
0.082 |
0.106 |
Chloride |
Blood Chlorobenzene Result |
−0.053 |
0.003 |
9 × 10−206 |
0 |
| biochemistry |
hormone |
3349 |
0.04 |
−0.115 |
−0.085 |
−0.069 |
0.048 |
0.070 |
0.087 |
Parathyroid Hormone(Elecys method) pg/mL |
Vitamin D |
−0.269 |
0.058 |
7 × 10−84 |
14 |
| biochemistry |
immunity |
385 |
0.14 |
−0.107 |
−0.071 |
−0.056 |
0.066 |
0.082 |
0.109 |
Globulin |
Metsulfuron methyl |
−0.028 |
0.001 |
3 × 10−61 |
0 |
| biochemistry |
inflammation |
670 |
0.10 |
−0.095 |
−0.071 |
−0.055 |
0.080 |
0.111 |
0.155 |
C-reactive protein |
Blood 1,1,1-Trichloroethane Result |
0.070 |
0.006 |
3 × 10−233 |
95 |
| biochemistry |
injury |
362 |
0.06 |
−0.158 |
−0.064 |
−0.025 |
0.056 |
0.071 |
0.130 |
Lactate dehydrogenase LDH |
Metsulfuron methyl |
−0.021 |
0.001 |
4 × 10−57 |
97 |
| biochemistry |
kidney |
1796 |
0.17 |
−0.118 |
−0.083 |
−0.061 |
0.068 |
0.111 |
0.166 |
Creatinine |
Blood Chlorobenzene Result |
0.117 |
0.025 |
0 |
0 |
| biochemistry |
lipids |
3278 |
0.10 |
−0.115 |
−0.066 |
−0.039 |
0.077 |
0.187 |
0.296 |
Apolipoprotein |
2,4,5-T (ug/L) result |
0.054 |
0.004 |
6 × 10−280 |
100 |
| biochemistry |
liver |
1880 |
0.11 |
−0.122 |
−0.085 |
−0.036 |
0.053 |
0.072 |
0.098 |
Gamma glutamyl transferase |
Hepatitis D |
−1.375 |
0.002 |
0 |
100 |
| biochemistry |
liver/kidney |
1390 |
0.12 |
−0.100 |
−0.080 |
−0.062 |
0.047 |
0.078 |
0.110 |
Albumin |
Pyridoxal 5'-phosphate |
0.252 |
0.060 |
2 × 10−177 |
52 |
| biochemistry |
metabolic |
2743 |
0.11 |
−0.112 |
−0.080 |
−0.051 |
0.052 |
0.088 |
0.160 |
Glucose, serum |
Blood Chlorobenzene Result |
0.020 |
0.001 |
1 × 10−90 |
83 |
| biochemistry |
nutritional status |
4019 |
0.13 |
−0.136 |
−0.099 |
−0.073 |
0.087 |
0.116 |
0.148 |
Mefox oxidation product |
Blood Chlorobenzene Result |
0.062 |
0.005 |
0 |
100 |
| blood pressure |
NA |
2710 |
0.07 |
−0.095 |
−0.063 |
−0.046 |
0.058 |
0.088 |
0.110 |
Systolic Mean |
Sulfosulfuron |
−0.034 |
0.001 |
2 × 10−255 |
100 |
| blood |
iron |
2223 |
0.05 |
−0.174 |
−0.118 |
−0.078 |
0.070 |
0.095 |
0.145 |
Iron, refigerated |
Hepatitis D |
−1.099 |
0.001 |
0 |
99 |
| blood |
NA |
9004 |
0.10 |
−0.093 |
−0.059 |
−0.044 |
0.053 |
0.087 |
0.130 |
Mean cell volume |
Hepatitis D |
1.521 |
0.001 |
0 |
100 |
| fitness |
NA |
663 |
0.07 |
−0.577 |
−0.151 |
−0.099 |
0.107 |
0.125 |
0.138 |
Predicted maximal oxygen uptake |
g-tocopherol |
−0.156 |
0.023 |
3 × 10−76 |
0 |
| lung |
exhaled NO |
332 |
0.11 |
−0.198 |
−0.154 |
−0.134 |
0.052 |
0.056 |
0.066 |
Mean of two reproducible FENO measurements, in parts per billion |
Cotinine |
−0.198 |
0.031 |
6 × 10−90 |
29 |
| lung |
lung function |
546 |
0.10 |
−0.093 |
−0.076 |
−0.063 |
0.042 |
0.053 |
0.140 |
Baseline 1st Test Spirometry, Forced Expiratory Volume in the first 1.0 sec |
Hepatitis A Antibody |
0.823 |
0.000 |
2 × 10−93 |
98 |
| microbiome |
NA |
2224 |
0.05 |
−0.029 |
−0.023 |
−0.015 |
0.026 |
0.106 |
0.153 |
RSV_InverseSimpsonSD |
Blood 1,2-Dichlorobenzene Result |
−0.027 |
0.001 |
4 × 10−56 |
83 |
## Volcano plots of meta-analytic estimates vs. capped p-values.
## Cap p-values at 1e-30 so -log10() stays finite/plottable for extreme hits.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(p_cap = ifelse(p.value < 1e-30, 1e-30, p.value))
## Panel A: faceted by exposure category (autoantibodies excluded).
p <- ggplot(adjusted_meta_2 |> filter(ecategory != 'autoantibody'), aes(estimate, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1))
p <- p + facet_grid(ecategory ~ .) + scale_color_npg()
## Bonferroni significance threshold line.
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
p <- p + theme_minimal() + theme(legend.position = "none") + ylab('-log10(p.value)') + xlab("estimate")
p1 <- p
## uBiome only — saved to its own object so it is not clobbered by the
## next plot (previously it was built and then immediately overwritten).
p <- ggplot(adjusted_meta_2 |> filter(pcategory == 'microbiome'), aes(estimate, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1))
p <- p + facet_grid(ecategory ~ .) + scale_color_npg()
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
p <- p + theme_minimal() + theme(legend.position = "none") + ylab('-log10(p.value)') + xlab("estimate")
p_microbiome <- p
## Panel B: all associations, faceted by phenotype category.
p <- ggplot(adjusted_meta_2, aes(estimate, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1))
p <- p + facet_grid(pcategory ~ .) + scale_color_npg()
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
p <- p + theme_minimal() + theme(legend.position = "none") + ylab('-log10(p.value)') + xlab("estimate")
p2 <- p
plot_grid(p1, p2, ncol=2, labels=c("A", "B"))
## Warning: Removed 55 rows containing missing values (geom_point).
## Removed 55 rows containing missing values (geom_point).

## Volcano-style plots with the R^2 gain (adjusted minus base model) on x.
## This re-mutate of p_cap repeats the earlier one and is idempotent.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(p_cap = ifelse(p.value < 1e-30, 1e-30, p.value))
p <- ggplot(adjusted_meta_2 |> filter(ecategory != 'autoantibody'), aes(mean_adjusted_base_r2_diff, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(0, .2))
p <- p + facet_grid(ecategory ~ .) + scale_color_npg()
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
## x is the R^2 difference, not the estimate — label accordingly.
p <- p + theme_minimal() + theme(legend.position = "none") + ylab('-log10(p.value)') + xlab("R^2 (adjusted - base)")
p1 <- p
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, -log10(p_cap)))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(0, .2))
p <- p + facet_grid(pcategory ~ .) + scale_color_npg()
p <- p + geom_hline(yintercept = -log10(.05/nrow(adjusted_meta_2)), color='lightblue')
p <- p + theme_minimal() + theme(legend.position = "none") + ylab('-log10(p.value)') + xlab("R^2 (adjusted - base)")
p2 <- p
plot_grid(p1, p2, ncol=2, labels=c("A", "B"))
## Warning: Removed 1130 rows containing missing values (geom_point).
## Warning: Removed 1130 rows containing missing values (geom_point).

## Summarize Bonferroni-significant hits within exposure (sub)category and
## estimate sign. `.groups = "drop_last"` makes the default grouping
## behavior explicit and silences the summarise() message.
## Fixed typo: number_signficant -> number_significant (local to this block).
e_effect_sizes_per <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> group_by(ecategory, esubcategory, sign(estimate)) |> summarize(median_pvalue=median(p.value), number_significant=n(), mean_estimate=mean((estimate)), .groups = "drop_last") |> arrange(-mean_estimate)
## Fall back to the broad category when the subcategory is missing.
e_effect_sizes_per <- e_effect_sizes_per |> mutate(esubcategory = ifelse(is.na(esubcategory), ecategory, esubcategory))
p <- ggplot(e_effect_sizes_per, aes(mean_estimate, -log10(median_pvalue), label=esubcategory))
p <- p + geom_point(aes(size=number_significant)) + geom_text_repel() + geom_vline(xintercept = 0)
## y is -log10 of the median p-value — label the sign correctly.
p <- p + theme_bw() + xlab("Average(Estimate) within exposome groups") + ylab("Median -log10(pvalue)")
p <- p + theme(legend.position = "bottom")
p
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Summarize Bonferroni-significant hits within phenotype (sub)category.
## Fixed typo: number_signficant -> number_significant (local to this block).
p_effect_sizes_per <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> group_by(pcategory, psubcategory) |> summarize(median_pvalue=median(p.value), number_significant=n(), mean_r2=mean((mean_adjusted_base_r2_diff)), .groups = "drop_last")
p <- ggplot(p_effect_sizes_per, aes(mean_r2, -log10(median_pvalue), label=psubcategory))
p <- p + geom_point(aes(size=number_significant)) + geom_text_repel() + geom_vline(xintercept = 0)
## x is the mean R^2 gain (adjusted - base), not the estimate; y is -log10.
p <- p + theme_bw() + xlab("Average R^2 (adjusted - base) within Phenome groups") + ylab("Median -log10(pvalue)")
p <- p + theme(legend.position = "bottom")
p
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_text_repel).

## Empirical CDFs of |estimate| by significance tier.
## Terms literally named 'expo' are the continuous exposure terms; any other
## term name is treated as a categorical indicator level.
adjusted_meta_2 <- adjusted_meta_2 |>
  mutate(vartype = ifelse(term_name == 'expo', 'continuous', 'categorical'))
## Categorical terms only.
p <- ggplot(adjusted_meta_2 |> filter(vartype == 'categorical'),
            aes(abs(estimate), color = sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits = c(0, .25)) +
  xlab("abs(estimate)") +
  ylab("percentile") +
  theme(legend.position = "bottom") +
  scale_color_npg()
p
## Warning: Removed 1574 rows containing non-finite values (stat_ecdf).

## Continuous terms only.
p <- ggplot(adjusted_meta_2 |> filter(vartype == 'continuous'),
            aes(abs(estimate), color = sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits = c(0, .25)) +
  xlab("abs(estimate)") +
  ylab("percentile") +
  theme(legend.position = "bottom") +
  scale_color_npg()
p
## Warning: Removed 192 rows containing non-finite values (stat_ecdf).

## All terms combined.
p <- ggplot(adjusted_meta_2, aes(abs(estimate), color = sig_levels)) +
  stat_ecdf() +
  scale_x_continuous(limits = c(0, .25)) +
  xlab("abs(estimate)") +
  ylab("percentile") +
  theme(legend.position = "bottom") +
  scale_color_npg()
p
## Warning: Removed 1766 rows containing non-finite values (stat_ecdf).

## ECDF of the R^2 gain (adjusted minus base model) among Bonferroni-
## significant associations; used to place each phenotype group at its
## percentile on the curve below.
ecdf_for_sig <- adjusted_meta_2 |> filter(sig_levels == 'Bonf.<0.05') |> pull(mean_adjusted_base_r2_diff) |> ecdf()
## NOTE(review): ecdf_for_non_sig is computed but never used below —
## confirm whether it is needed or can be dropped.
ecdf_for_non_sig <- adjusted_meta_2 |> filter(sig_levels == '> BY & Bonf.') |> pull(mean_adjusted_base_r2_diff) |> ecdf()
## q = percentile of each phenotype group's mean R^2 among significant hits.
p_effect_sizes_per <- p_effect_sizes_per |> mutate(q = ecdf_for_sig(mean_r2), sig_levels ='Bonf.<0.05')
## Fall back to the broad category when the subcategory is missing.
p_effect_sizes_per <- p_effect_sizes_per |> mutate(p_cat = ifelse(is.na(psubcategory), pcategory, psubcategory))
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, color=sig_levels))
p <- p + stat_ecdf() + scale_x_continuous(limits=c(0, .05)) +scale_color_aaas()
## Overlay the per-group percentiles as labeled points on the ECDF.
p <- p + geom_point(data=p_effect_sizes_per, aes(x=mean_r2, y = q, color=sig_levels))
p <- p + geom_text_repel(data=p_effect_sizes_per, aes(x=mean_r2, y = q, color=sig_levels, label=p_cat))
p <- p + xlab("R^2 (adjusted-base model)") + ylab("percentile")
p <- p + theme_bw() + theme(legend.position="bottom")
p
## Warning: Removed 1295 rows containing non-finite values (stat_ecdf).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_text_repel).

Number of associations found at different levels of statistical power
## Bin per-association power into low/medium/high groups and count
## discoveries per significance tier. power_01/power_005 presumably refer
## to alpha = 0.1 and 0.05 — TODO confirm upstream.
adjusted_meta_2 <- adjusted_meta_2 |> mutate(power_groups_01=cut(power_01, breaks=c(0, .4, .8, 1)))
adjusted_meta_2 <- adjusted_meta_2 |> mutate(power_groups_005=cut(power_005, breaks=c(0, .4, .8, 1)))
discovery_by_power_01 <- adjusted_meta_2 |> filter(!is.na(power_01)) |> group_by(sig_levels, power_groups_01) |> count()
discovery_by_power_005 <- adjusted_meta_2 |> filter(!is.na(power_005)) |> group_by(sig_levels, power_groups_005) |> count()
## reorder sig_levels
adjusted_meta_2 <- adjusted_meta_2 |> mutate(sig_levels=fct_relevel(sig_levels, c("> BY & Bonf.","BY<0.05", "Bonf.<0.05")))
## Heterogeneity (I^2) vs. number of surveys contributing to each
## meta-analyzed association, split by significance tier.
p <- ggplot(adjusted_meta_2, aes(factor(nobs), i.squared,color=sig_levels))
p <- p + geom_boxplot() + xlab("Number of Surveys for PE Association") + theme(legend.position="bottom") + scale_color_aaas()
p <- p + theme_bw() + theme(legend.position="bottom") + ylab("i-squared")
p

## Median I^2 within each significance tier.
i2_medians <- adjusted_meta_2 |> group_by(sig_levels) |> summarize(i2_median=median(i.squared))
Showcasing associations:
- Low p-value, higher R², low I²
- Low p-value, higher R², and higher I²
## Counts of Bonferroni-significant pairs, overall and with replication.
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05) |> nrow()
## [1] 7226
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2) |> nrow()
## [1] 0
## NOTE(review): per the printed count above, the n_pvalue_lt >= 2 filter
## matches zero rows here, so both pair sets below come out empty — confirm
## the column/threshold is the intended one.
non_het_pairs <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared < 50, mean_adjusted_base_r2_diff > .025)
het_pairs <- adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05, n_pvalue_lt >= 2, i.squared > 50, mean_adjusted_base_r2_diff > .025, nobs >= 4)
#het_pairs_2 <- adjusted_meta_3 |> filter(sig_levels == 'BY<0.05', i.squared > 90, nobs >= 4)
## Significant pairs broken down by exposure category.
adjusted_meta_3 |> filter(pvalue_bonferroni < 0.05) |> group_by(ecategory) |> count()
## non-heterogeneous example
## Forest plot for one exposure/phenotype pair: survey-specific adjusted
## estimates (circles, shape 21) plus the pooled "overall" estimate
## (diamond, shape 23) with its I^2 printed beside it.
##
## Args:
##   evarname_str:    exposure variable name (e.g. 'LBXCOT').
##   pvarname_str:    phenotype variable name (e.g. 'BMXBMI').
##   estimate_limits: y-axis limits for the estimates; pass c() (i.e. NULL)
##                    to let ggplot choose them automatically.
## Returns: a ggplot object.
plot_pair <- function(evarname_str, pvarname_str, estimate_limits=c(0.01,.35)) {
## Per-survey adjusted estimates for the requested pair.
test_1 <- expos_wide |> filter(evarname == evarname_str, pvarname == pvarname_str) |> select(Begin.Year, exposure_table_name, phenotype_table_name, e_variable_description, p_variable_description, estimate_adjusted, std.error_adjusted, p.value_adjusted)
exposure_name <- remove_units_from_string(test_1$e_variable_description[1])
phenotype_name <- remove_units_from_string(test_1$p_variable_description[1])
## Harmonize column names so the meta-analytic row can be appended below.
test_1 <- test_1 |> select(Begin.Year, estimate_adjusted, std.error_adjusted) |> rename(estimate=estimate_adjusted, std.error = std.error_adjusted, Survey=Begin.Year) |> mutate(i.squared = NA, i.squared_text='')
meta_test_1 <- adjusted_meta_2 |> filter(evarname == evarname_str, pvarname == pvarname_str) |> mutate(Survey = 'overall') |> select(Survey, estimate, std.error, i.squared) |> mutate(i.squared_text = sprintf("%i%%", round(i.squared)))
test_1 <- test_1 |> rbind(meta_test_1)
## Diamond for the pooled estimate, circle for each survey.
test_1 <- test_1 |> mutate(point_shape = ifelse(Survey == 'overall', 23, 21)) |> mutate(point_shape = as.integer(point_shape))
test_1 <- test_1 |> mutate(point_size = ifelse(Survey == 'overall', 7, 2))
p <- ggplot(test_1, aes(Survey, estimate))
## NOTE(review): fill is mapped to the numeric point_shape without a
## scale_fill_identity(), so fills come from the default continuous
## scale — confirm this is intended.
p <- p + geom_point(aes(shape=point_shape, size=point_size, fill=point_shape)) + scale_shape_identity() + scale_size_identity()
p <- p + geom_text(data=meta_test_1, aes(Survey, estimate, label=i.squared_text), size=3, color="black", nudge_x=.6)
## 95% CI bars; rev puts the earliest survey at the top after coord_flip.
p <- p + geom_errorbar(aes(ymin=estimate-1.96*std.error, ymax=estimate+1.96*std.error), width=.1) + scale_x_discrete(limits=rev)
p <- p + scale_y_continuous(limits=estimate_limits)
p <- p + coord_flip() + theme_bw() + theme(legend.position = "none")
p <- p + ggtitle(sprintf('scale(%s)-scale(log10(%s))', phenotype_name, exposure_name))+ theme(plot.title = element_text(size = 7))
## Return the plot explicitly; the final assignment alone would return
## it only invisibly.
p
}
## non-heterogeneous example
p1 <- plot_pair('URXP01', 'LBDNENO')
## heterogeneous example
p2 <- plot_pair('LBXGTC', 'BMXBMI')
## Passing c() drops the default y-axis limits (ggplot autoscale).
p3 <- plot_pair('LBXBPB', 'BMXHT', c()) # 33% i2
p4 <- plot_pair('LBXCOT', 'BPXPLS', c()) # 33% i2
#expos_wide |> filter(evarname == 'LBXPFOS', pvarname == 'LBXSAL')
plot_grid(p1, p2, p3, p4, ncol=2,labels = c('A', 'B', "C", "D"), label_size = 12)

# Examples for paper of top hits
## Between-survey variance (tau^2) for each showcased pair.
rbind(
adjusted_meta_2 |> filter(evarname == 'LBXPFHS', pvarname == 'LBXSAL') |> select(tau.squared),
adjusted_meta_2 |> filter(evarname == 'LBXGTC', pvarname == 'LBXTC') |> select(tau.squared),
adjusted_meta_2 |> filter(evarname == 'LBXBPB', pvarname == 'BMXWT') |> select(tau.squared),
adjusted_meta_2 |> filter(evarname == 'LBXCOT', pvarname == 'LBDNENO') |> select(tau.squared)
)
## Spread of each exposure's estimates across phenotypes, per tier.
adjusted_meta_2 |> filter(evarname == 'LBXPFHS') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
adjusted_meta_2 |> filter(evarname == 'LBXGTC') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
adjusted_meta_2 |> filter(evarname == 'LBXBPB') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
adjusted_meta_2 |> filter(evarname == 'LBXCOT') |> group_by(sig_levels) |> summarize(sd_estimate=sd(estimate))
## Forest plots for the four showcased pairs.
p1 <- plot_pair('LBXPFHS', 'LBXSAL') + ylab("Association Estimate")
## heterogeneous example
p2 <- plot_pair('LBXGTC', 'LBXTC') + ylab("Association Estimate")
p3 <- plot_pair('LBXBPB', 'BMXWT', c(-.3, 0)) + ylab("Association Estimate")
p4 <- plot_pair('LBXCOT', 'LBDNENO')+ ylab("Association Estimate")
plot_grid(p1, p2, p3, p4, ncol=2,labels = c('A', 'B', "C", "D"), label_size = 12)

library(ggridges)
## show histogram of associations for all top findings for LBXGTC, "LBXBPB", "LBXPFHS", "LBXCOT"
ep_candidates <- tibble(evarname = c("LBXGTC", "LBXBPB", "LBXPFHS", "LBXCOT"),
pvarname = c("LBXTC", "BMXWT", "LBXSAL", "LBDNENO"))
## All Bonferroni-significant associations for the candidate exposures.
exposure_dist <- adjusted_meta_2 |> filter(evarname %in% ep_candidates$evarname) |> filter(sig_levels == "Bonf.<0.05") |> mutate(evardesc = remove_units_from_string(evardesc))
## collect Survey specific associations
survey_exposure_pts <- vector(mode = "list", length=nrow(ep_candidates))
## seq_len() is safe for an empty table (1:nrow() would yield c(1, 0)).
for(rw_num in seq_len(nrow(ep_candidates))) {
survey_exposure_pts[[rw_num]] <- expos_wide |> filter(evarname == ep_candidates$evarname[rw_num], pvarname == ep_candidates$pvarname[rw_num]) |>
select(Begin.Year, evarname, pvarname, exposure_table_name, phenotype_table_name, e_variable_description, p_variable_description, estimate_adjusted, std.error_adjusted, p.value_adjusted)
}
survey_exposure_pts <- survey_exposure_pts |> bind_rows() |> mutate(evardesc=remove_units_from_string(e_variable_description))
## Shorten NHANES names (drop the 3-character prefix) for axis labels.
exposure_dist <- exposure_dist |> mutate(evarname = substr(evarname, 4, 8))
survey_exposure_pts <- survey_exposure_pts |> mutate(pvarname = substr(pvarname, 4, 8), evarname = substr(evarname, 4, 8))
## Ridgeline of estimate distributions per exposure, with the showcased
## pair's survey-specific estimates overlaid as colored points.
p <- ggplot(exposure_dist, aes(y=evarname, x=estimate, fill=evarname))
p <- p + geom_density_ridges()+ scale_fill_jama(guide="none") #+ scale_colour_continuous(guide = "none")
p <- p + geom_point(data=survey_exposure_pts, aes(y=evarname, x=estimate_adjusted, color=factor(Begin.Year))) + scale_color_aaas(name = "")
pheno_het <- p + theme_bw() + theme(legend.position='bottom') + ylab("Exposure Name") + xlab("Association Estimate")
survey_het <- plot_grid(p1, p2, p3, p4, ncol=2,labels = c('A', 'B', "C", "D"), label_size = 12)
p <- plot_grid(survey_het, pheno_het, ncol=2, labels=c("", "E"), label_size = 12)
## Picking joint bandwidth of 0.0332
R2 adjusted vs. non-adjusted
## R^2 of the base (demographics-only) model vs. the full model.
p <- ggplot(expos_wide, aes(rsq_2_1, rsq_2_0))
p <- p + geom_point(shape='.') + xlab("R2 Base [Demographics]") + ylab("R2 [Exposure + Demographics]")
p <- p + theme_bw()
p

## Distribution of base-model R^2 across all pairs.
p <- ggplot(expos_wide, aes(rsq_2_1))
p <- p + geom_histogram() + xlab("R2 Base [Demographics]")
p <- p + theme_bw()
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Adjusted vs. unadjusted model R^2.
p <- ggplot(expos_wide, aes(rsq_2_0, rsq_1_0))
p <- p + geom_point(shape='.') + xlab("R2 Adjusted [Exposure + Demographics]") + ylab("R2 Unadjusted [Exposure]")
p <- p + theme_bw()
p

## Heterogeneity vs. incremental R^2. x is the adjusted-minus-base R^2
## difference, so label it as such (the label previously reused the
## adjusted-model text).
p <- ggplot(adjusted_meta_2, aes(mean_adjusted_base_r2_diff, i.squared))
p <- p + geom_point(shape='.') + xlab("R2 difference [adjusted - base]") + ylab("i.squared") + scale_x_continuous(limits=c(0, .01))
p <- p + theme_bw() + facet_grid(~sig_levels)
p
## Warning: Removed 3752 rows containing missing values (geom_point).

Estimates and p-values: adjusted vs. non-adjusted models
- Estimate the association globally between the adjusted and non-adjusted
models
## Per-survey estimates: adjusted (suffix _2) vs. unadjusted (suffix _1)
## models, per the axis labels below; suffixes come from pivot_wider on
## model_number.
p <- ggplot(expos_wide, aes(estimate_2, estimate_1))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-2, 2)) + scale_y_continuous(limits=c(-2, 2)) + xlab("Adjusted [Exposure + Demographics]") + ylab("Unadjusted [Exposure]") + geom_abline()
p <- p + geom_smooth(method="lm")
p <- p + theme_bw()
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 161 rows containing non-finite values (stat_smooth).
## Warning: Removed 161 rows containing missing values (geom_point).

## how much bias?
tidy(lm(estimate_2 ~ estimate_1, expos_wide)) # biased to be lower (intercept is negative, and slope is less than 1)
#
## P-value agreement between the two models.
p <- ggplot(expos_wide, aes(-log10(p.value_2), -log10(p.value_1)))
p <- p + geom_point(shape='.', alpha=.1) + xlab("Adjusted model pvalue [Exposure + Demographics]") + ylab("Unadjusted Pvalue [Exposure]")
p <- p + geom_abline()
p <- p + theme_bw()
p
## Warning: Removed 20562 rows containing missing values (geom_point).

## Z-statistic agreement, truncated to +/-20 for display.
p <- ggplot(expos_wide, aes(statistic_2, statistic_1))
p <- p + geom_point(shape='.', alpha=.1) + xlab("Adjusted model z [Exposure + Demographics]") + ylab("Unadjusted z [Exposure]")
p <- p + geom_abline() + scale_y_continuous(limit=c(-20, 20)) + scale_x_continuous(limits=c(-20,20))
p <- p + theme_bw()
p
## Warning: Removed 1871 rows containing missing values (geom_point).

##
## Map each model_number to a human-readable adjustment scenario and a
## display order, from least adjusted ("base") to fully adjusted
## ("age_sex_ethnicity_income_education", i.e. model 2).
adjustment_number_order <- rbind(
tibble(model_number=1, scenario="base", order_number=1),
tibble(model_number=5, scenario="sex", order_number=2),
tibble(model_number=4, scenario="age", order_number=3),
tibble(model_number=3, scenario="age_sex", order_number=4),
tibble(model_number=9, scenario="ethnicity", order_number=5),
tibble(model_number=6, scenario="age_sex_ethnicity", order_number=6),
tibble(model_number=8, scenario="income_education", order_number=7),
tibble(model_number=7, scenario="age_sex_income_education", order_number=8),
tibble(model_number=2, scenario="age_sex_ethnicity_income_education", order_number=9)
)
#bonf_thresh <- 0.05/nrow(adjusted_meta_2)
## Pull survey-pooled estimates for every adjustment scenario and attach
## the fully adjusted results for side-by-side comparison.
adjusted_meta_tp <- adjusted_meta |> filter(nobs >= 2 ) |> select(evarname, pvarname, model_number,term_name, estimate, std.error, statistic, p.value) |> collect()
#adjusted_meta_full <- adjusted_meta_tp |> filter(model_number == 2) |> rename(estimate_adjusted=estimate, p.value_adjusted = p.value, statistic_adjusted=statistic) |> select(-model_number)
## Rename the fully adjusted columns so they can sit beside each
## scenario's own estimates after the join.
adjusted_meta_full <- adjusted_meta_2 |> rename(estimate_adjusted=estimate, p.value_adjusted = p.value, statistic_adjusted=statistic) |> select(-model_number)
adjusted_meta_tp <- adjusted_meta_tp |> left_join(adjusted_meta_full, by=c("evarname", "pvarname", "term_name"))
## Attach human-readable scenario labels and display order.
adjusted_meta_tp <- adjusted_meta_tp |> left_join(adjustment_number_order |> select(c(model_number, scenario, order_number)), by="model_number")
## Spread of each scenario's estimates around the fully adjusted estimate.
adjusted_meta_tp |> group_by(scenario) |> summarize(sd_estimate = sd(estimate-estimate_adjusted, na.rm=T))
p <- ggplot(adjusted_meta_tp, aes(statistic_adjusted, statistic))
p <- p + geom_point(shape='.') + facet_wrap(~scenario) + geom_abline(slope=1, intercept = 0) + theme_bw()
p
## Warning: Removed 45 rows containing missing values (geom_point).

## Each reduced-adjustment scenario's estimates vs. the fully adjusted
## estimates; the fully adjusted scenario itself is filtered out, so it
## is also omitted from fct_relevel() — listing it there produced an
## "Unknown levels" warning.
p <- ggplot(adjusted_meta_tp |> filter( scenario != 'age_sex_ethnicity_income_education') |> mutate(scenario = fct_relevel(scenario, "base", "age", "sex", "age_sex", "ethnicity", "income_education", "age_sex_income_education", "age_sex_ethnicity")), aes(estimate_adjusted, estimate))
p <- p + geom_point(shape='.') + facet_wrap(~scenario, ncol = 4) + theme_bw()
p
## Warning: Removed 45 rows containing missing values (geom_point).

## Distribution of estimate shifts relative to the fully adjusted model,
## per adjustment scenario.
p <- ggplot(adjusted_meta_tp, aes(estimate-estimate_adjusted)) +
  geom_histogram() +
  facet_wrap(~scenario) +
  theme_bw() +
  scale_x_continuous(limits=c(-.1, .1))
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 45494 rows containing non-finite values (stat_bin).

# adjusted_meta_tp
## model_number 1 is the "base" scenario per adjustment_number_order;
## compare its (unadjusted) estimates with the fully adjusted ones within
## each significance tier.
am2 <- adjusted_meta_tp |> filter(model_number == 1) |> filter(!is.na(sig_levels))
p <- ggplot(am2 , aes(estimate_adjusted, estimate, color=sig_levels))
p <- p + geom_point(shape='.') + scale_x_continuous(limits=c(-1, 1)) + scale_y_continuous(limits=c(-1, 1)) + scale_color_aaas()
p <- p + geom_abline()
p <- p + facet_grid(~sig_levels) + xlab("Adjusted model estimate [Exposure + Demographics]") + ylab("Unadjusted estimate [Exposure]")
p <- p + geom_smooth(method="lm")
p <- p + theme_bw() +theme(legend.position = "none") ## need to get sig_levels
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 160 rows containing non-finite values (stat_smooth).
## Warning: Removed 160 rows containing missing values (geom_point).

#
#
#
## Regressions of adjusted on unadjusted estimates, overall and by tier,
## to quantify attenuation (slope) and bias (intercept).
tidy(lm(estimate_adjusted ~ estimate, am2))
tidy(lm(estimate_adjusted ~ estimate, am2 |> filter(pvalue_bonferroni < .05)))
tidy(lm(estimate_adjusted ~ estimate, am2 |> filter(sig_levels == '> BY & Bonf.')))
tidy(lm(estimate_adjusted ~ estimate, am2 |> filter(sig_levels == 'BY<0.05')))
#
## Unadjusted vs. adjusted estimates, faceted by exposure category.
p <- ggplot(
  adjusted_meta_tp |> filter(model_number == 1, !is.na(ecategory)),
  aes(estimate, estimate_adjusted, color = ecategory)
) +
  geom_point(shape = '.') +
  scale_x_continuous(limits = c(-1, 1)) +
  scale_y_continuous(limits = c(-1, 1)) +
  scale_color_aaas() +
  geom_abline() +
  facet_grid(~ecategory) +
  ylab("Adjusted model estimate [Exposure + Demographics]") +
  xlab("Unadjusted estimate [Exposure]") +
  geom_smooth(method = "lm") +
  theme_bw() +
  theme(legend.position = "none")
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 160 rows containing non-finite values (stat_smooth).
## Removed 160 rows containing missing values (geom_point).

## Histogram of unadjusted-minus-adjusted estimate differences, faceted
## by exposure category.
p <- ggplot(
  adjusted_meta_tp |> filter(model_number == 1, !is.na(ecategory)),
  aes(estimate - estimate_adjusted, color = ecategory)
) +
  geom_histogram() +
  scale_x_continuous(limits = c(-1, 1)) +
  scale_color_aaas() +
  facet_grid(~ecategory) +
  theme_bw() +
  theme(legend.position = "none")
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## Warning: Removed 16 rows containing missing values (geom_bar).

## Unadjusted vs. adjusted estimates, faceted by phenotype category.
p <- ggplot(
  adjusted_meta_tp |> filter(model_number == 1, !is.na(pcategory)),
  aes(estimate, estimate_adjusted, color = pcategory)
) +
  geom_point(shape = '.') +
  scale_x_continuous(limits = c(-1, 1)) +
  scale_y_continuous(limits = c(-1, 1)) +
  scale_color_aaas() +
  geom_abline() +
  facet_grid(~pcategory) +
  ylab("Adjusted model estimate [Exposure + Demographics]") +
  xlab("Unadjusted estimate [Exposure]") +
  geom_smooth(method = "lm") +
  theme_bw() +
  theme(legend.position = "none")
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 160 rows containing non-finite values (stat_smooth).
## Warning: Removed 160 rows containing missing values (geom_point).
